This document summarizes Rick Gilmore’s analysis of the Jaccard index data.
The goal is to explore ways to generate an empirical “null” distribution of the Jaccard index data to compare it to the observed data.
Note: I have set eval=FALSE for a series of chunks below that generate the permuted sorting and Jaccard index data. These take many minutes to run.
/analysis/data/{P1,P31M,P3M1,P6,P6M}-sorting.csv and create new CSVs with the permuted values.analysis/make.jaccard.df.R function.n times, where n is large, probably 1,000.Let’s build and test a permutation function for the raw sorting data.
Now, let’s generate multiple permuted CSVs.
generate_n_sorting_permutations <-
function(wp_group = "P1",
n_permutations = 5) {
csv_in <- paste0("analysis/data/", wp_group, "-sorting.csv")
if (!file.exists(csv_in)) {
stop(paste0("`csv_in` not found: ", csv_in))
}
df_in <- readr::read_csv(csv_in)
df_exemplars <- df_in[, -c(1, 2, 23, 24)]
out_m <- as.matrix(df_exemplars)
for (p in 1:n_permutations) {
csv_out <-
paste0(
"analysis/data/permutation_analysis/sorting_csv/",
wp_group,
"-sorting-perm-",
stringr::str_pad(p, 3, pad = 0), ".csv"
)
for (r in 1:dim(out_m)[1]) {
new_i <- sample(1:20)
out_m[r, 1:20] <- out_m[r, new_i]
}
array_out <-
as.data.frame(cbind(df_in$Participant, df_in$Set, out_m, df_in$Set_size, df_in$Group))
# Rename!
names(array_out) <-
c("Participant",
"Set",
names(df_exemplars),
"Set_size",
"Group")
array_out
readr::write_csv(array_out, csv_out)
}
}
Then we test it.
generate_n_sorting_permutations()
Now, let’s confirm that we can calculate Jaccard indices from these data.
make_jaccard_csvs <- function(wallpaper_group = "P1",
duplicates = FALSE,
input_dir = 'analysis/data/permutation_analysis/sorting_csv/',
output_dir = 'analysis/data/permutation_analysis/jaccards/') {
# Makes a data.frame from the raw sorting data
# Load externals
source("analysis/jaccard.data.R")
source("analysis/jaccard.R")
these_csvs <-
list.files(input_dir, paste0("^", wallpaper_group, "\\-"), full.names = TRUE)
purrr::map(these_csvs,
calculate_save_jaccard_df,
wallpaper_group,
jaccard_dir = output_dir)
}
# Load a sorting permutation file, calculate the Jaccard indices, and (conditionally) save it to file.
calculate_save_jaccard_df <- function(this_csv,
wallpaper_group,
save_output = TRUE,
jaccard_dir = "analysis/data/permutation_analysis/jaccards/",
vb = FALSE) {
this_fn <- basename(this_csv)
this_perm_number <- stringr::str_extract(this_fn, "[0-9]{3}")
out_fn <-
paste0(jaccard_dir,
wallpaper_group,
"-jaccard-",
this_perm_number,
".csv")
this_df <- readr::read_csv(this_csv)
# Calculate Jaccard
jaccard_df <- jaccard.data(this_df)
if (save_output) {
if (vb) message(paste0('Saving ', out_fn))
readr::write_csv(jaccard_df, out_fn)
} else {
jaccard_df
}
}
make_jaccard_csvs()
generate_n_sorting_permutations("P1", n_permutations = 999)
make_jaccard_csvs("P1")
generate_n_sorting_permutations("P31M", n_permutations = 999)
make_jaccard_csvs("P31M")
generate_n_sorting_permutations("P3M1", n_permutations = 999)
make_jaccard_csvs("P3M1")
generate_n_sorting_permutations("P6", n_permutations = 999)
make_jaccard_csvs("P6")
generate_n_sorting_permutations("P6M", n_permutations = 999)
make_jaccard_csvs("P6M")
make_perm_jaccard_df <- function(this_csv) {
this_fn <- basename(this_csv)
this_perm_number <- stringr::str_extract(this_fn, "[0-9]{3}")
this_df <- readr::read_csv(this_csv)
this_df <- this_df %>%
dplyr::mutate(
.,
exemplar_pair = paste0(
stringr::str_extract(Exemplar.Row, "[0-9]{3}$"),
"-",
stringr::str_extract(Exemplar.Col, "[0-9]{3}$")
),
perm = this_perm_number
)
this_df
}
make_aggregate_perm_jaccard_df <- function(wp_group = "P1",
input_dir = "analysis/data/permutation_analysis/jaccards",
save_csv = TRUE,
output_dir = "analysis/data/permutation_analysis/aggregates",
vb = TRUE) {
these_csvs <-
list.files(input_dir, paste0("^", wp_group, "\\-"), full.names = TRUE)
df <- purrr::map_df(these_csvs, make_perm_jaccard_df)
if (save_csv) {
out_fn <- file.path(output_dir, paste0(wp_group, "-aggregate-perm-jaccard.csv"))
if (vb) message(paste0("Saving ", out_fn))
readr::write_csv(df, out_fn)
} else {
df
}
}
Import the data.
P1_perm_df <- make_aggregate_perm_jaccard_df("P1")
P1_perm_df <- readr::read_csv("analysis/data/permutation_analysis/aggregates/P1-aggregate-perm-jaccard.csv")
Visualize.
P1_perm_df %>%
ggplot2::ggplot(.) +
ggplot2::aes(x = Jaccard) +
ggplot2::geom_histogram(bins = 50)
Generate summary stats by exemplar pair.
P1_perm_stats_df <- P1_perm_df %>%
dplyr::group_by(., Group, exemplar_pair) %>%
dplyr::summarize(., jaccard_mean = mean(Jaccard),
jaccard_sd = sd(Jaccard))
Import the data.
P31M_perm_df <- make_aggregate_perm_jaccard_df("P31M")
P31M_perm_df <- readr::read_csv("analysis/data/permutation_analysis/aggregates/P31M-aggregate-perm-jaccard.csv")
Visualize.
P31M_perm_df %>%
ggplot2::ggplot(.) +
ggplot2::aes(x = Jaccard) +
ggplot2::geom_histogram(bins = 50)
Generate summary stats by exemplar pair.
P31M_perm_stats_df <- P31M_perm_df %>%
dplyr::group_by(., Group, exemplar_pair) %>%
dplyr::summarize(., jaccard_mean = mean(Jaccard),
jaccard_sd = sd(Jaccard))
Import the data.
P3M1_perm_df <- make_aggregate_perm_jaccard_df("P3M1")
P3M1_perm_df <- readr::read_csv("analysis/data/permutation_analysis/aggregates/P3M1-aggregate-perm-jaccard.csv")
Visualize.
P3M1_perm_df %>%
ggplot2::ggplot(.) +
ggplot2::aes(x = Jaccard) +
ggplot2::geom_histogram(bins = 50)
Generate summary stats by exemplar pair.
P3M1_perm_stats_df <- P3M1_perm_df %>%
dplyr::group_by(., Group, exemplar_pair) %>%
dplyr::summarize(., jaccard_mean = mean(Jaccard),
jaccard_sd = sd(Jaccard))
Import the data.
P6_perm_df <- make_aggregate_perm_jaccard_df("P6")
P6_perm_df <- readr::read_csv("analysis/data/permutation_analysis/aggregates/P6-aggregate-perm-jaccard.csv")
Visualize.
P6_perm_df %>%
ggplot2::ggplot(.) +
ggplot2::aes(x = Jaccard) +
ggplot2::geom_histogram(bins = 50)
Generate summary stats by exemplar pair.
P6_perm_stats_df <- P6_perm_df %>%
dplyr::group_by(., Group, exemplar_pair) %>%
dplyr::summarize(., jaccard_mean = mean(Jaccard),
jaccard_sd = sd(Jaccard))
Import the data.
P6M_perm_df <- make_aggregate_perm_jaccard_df("P6M")
P6M_perm_df <- readr::read_csv("analysis/data/permutation_analysis/aggregates/P6M-aggregate-perm-jaccard.csv")
Visualize.
P6M_perm_df %>%
ggplot2::ggplot(.) +
ggplot2::aes(x = Jaccard) +
ggplot2::geom_histogram(bins = 50)
Generate summary stats by exemplar pair.
P6M_perm_stats_df <- P6M_perm_df %>%
dplyr::group_by(., Group, exemplar_pair) %>%
dplyr::summarize(., jaccard_mean = mean(Jaccard),
jaccard_sd = sd(Jaccard))
jaccard_perm_df <- rbind(P1_perm_df, P31M_perm_df, P3M1_perm_df, P6_perm_df, P6M_perm_df)
Visualization.
jaccard_perm_df %>%
ggplot(.) +
aes(Jaccard, color = Group) +
facet_grid(Group ~ .) +
geom_boxplot(bins = 50)
jaccard_perm_df %>%
ggplot(.) +
aes(Jaccard, color = Group) +
facet_grid(Group ~ .) +
geom_boxplot(bins = 50)
jaccard_perm_df %>%
ggplot(.) +
aes(x = Group, y = Jaccard) +
geom_violin()
These plots show that the mean differences in Jaccard indices are reflected in the participants’ data are shown in the permuted data, too. This makes sense since the participants detected regularities and sorted the exemplars into different numbers of sets. In permuting the exemplar identifiers within participants, we keep some of this structure.
Let’s try aggregating the by-exemplar statistics.
jaccard_perm_stats_df <- rbind(P1_perm_stats_df, P31M_perm_stats_df, P3M1_perm_stats_df, P6_perm_stats_df, P6M_perm_stats_df)
# Sort by group, exemplar_pair
jaccard_perm_stats_df <- jaccard_perm_stats_df %>%
dplyr::arrange(Group, exemplar_pair)
jaccard_perm_stats_df %>%
ggplot(.) +
aes(x = jaccard_mean, fill = Group) +
geom_histogram() +
facet_grid(Group ~ .) +
ggtitle("Mean exemplar-pair Jaccard indices for permuted data")
jaccard_perm_stats_df %>%
ggplot(.) +
aes(x = jaccard_sd, fill = Group) +
geom_histogram() +
facet_grid(Group ~ .) +
ggtitle("Standard deviation of exemplar-pair Jaccard indices for permuted data")
Load the observed data and clean it.
jaccard_observed_df <-
readr::read_csv("analysis/data/jaccard-no-duplicates.csv")
jaccard_observed_df <- jaccard_observed_df %>%
dplyr::mutate(.,
exemplar_pair = paste0(
stringr::str_extract(Exemplar.Row, "[0-9]{3}$"),
"-",
stringr::str_extract(Exemplar.Col, "[0-9]{3}$")
)) %>%
dplyr::arrange(., Group, exemplar_pair)
Now, merge the permuted data with the observed data.
jaccard_merged_df <- dplyr::left_join(jaccard_perm_stats_df,
jaccard_observed_df,
by = c("Group", "exemplar_pair"))
# Rearrange columns for convenience
jaccard_merged_df <- jaccard_merged_df %>%
dplyr::select(., Group, exemplar_pair, Jaccard, jaccard_mean, jaccard_sd, Exemplar.Row, Exemplar.Col)
# Rename variables for clarity
jaccard_merged_df <- jaccard_merged_df %>%
dplyr::rename(., group = Group, jaccard_obs = Jaccard,
jaccard_emp_mean = jaccard_mean,
jaccard_emp_sd = jaccard_sd,
exemplar_row = Exemplar.Row,
exemplar_col = Exemplar.Col)
Calculate empirical z as \(z_{emp}=J_{obs}-\mu_{J}\) for each exemplar pair.
jaccard_merged_df <- jaccard_merged_df %>%
dplyr::mutate(., z_emp = (jaccard_obs-jaccard_emp_mean)/jaccard_emp_sd)
Plot a histogram of z_emp.
jaccard_merged_df %>%
ggplot(.) +
aes(z_emp, fill = group) +
geom_histogram() +
facet_grid(group ~ .)
Curiously, P31M, P6, P6M and P3M1 have exemplar pairs whose observed Jaccard indices are substantially larger than the empirically derived reference (null) distribution even though the mean Jaccard indices for P1 are the largest.
Just for fun, let’s print the exemplar pairs whose z_emp exceed 4.
jaccard_merged_df %>%
dplyr::filter(., z_emp > 4) %>%
dplyr::arrange(., group, desc(jaccard_emp_mean)) %>%
knitr::kable(.)
| group | exemplar_pair | jaccard_obs | jaccard_emp_mean | jaccard_emp_sd | exemplar_row | exemplar_col | z_emp |
|---|---|---|---|---|---|---|---|
| P1 | 008-009 | 0.4347826 | 0.1997302 | 0.0523893 | 101008 | 101009 | 4.486650 |
| P31M | 006-016 | 0.3469388 | 0.1544913 | 0.0455907 | 115006 | 115016 | 4.221202 |
| P31M | 002-020 | 0.4347826 | 0.1541413 | 0.0445330 | 115002 | 115020 | 6.301875 |
| P31M | 004-009 | 0.4666667 | 0.1536301 | 0.0458028 | 115004 | 115009 | 6.834437 |
| P31M | 008-015 | 0.3469388 | 0.1534583 | 0.0454080 | 115008 | 115015 | 4.260934 |
| P31M | 014-020 | 0.5000000 | 0.1532609 | 0.0443117 | 115014 | 115020 | 7.824999 |
| P31M | 007-020 | 0.3469388 | 0.1530137 | 0.0472758 | 115007 | 115020 | 4.101998 |
| P31M | 002-007 | 0.6500000 | 0.1528091 | 0.0456546 | 115002 | 115007 | 10.890260 |
| P31M | 002-014 | 0.4347826 | 0.1523665 | 0.0464296 | 115002 | 115014 | 6.082672 |
| P31M | 007-014 | 0.3469388 | 0.1520057 | 0.0465605 | 115007 | 115014 | 4.186665 |
| P31M | 009-014 | 0.3469388 | 0.1501004 | 0.0442869 | 115009 | 115014 | 4.444622 |
| P3M1 | 019-020 | 0.4042553 | 0.1436215 | 0.0475534 | 114019 | 114020 | 5.480869 |
| P3M1 | 003-016 | 0.3469388 | 0.1431156 | 0.0447568 | 114003 | 114016 | 4.554010 |
| P3M1 | 011-019 | 0.3750000 | 0.1415372 | 0.0445837 | 114011 | 114019 | 5.236508 |
| P6 | 001-017 | 0.3200000 | 0.1384481 | 0.0436607 | 116001 | 116017 | 4.158245 |
| P6 | 007-009 | 0.4666667 | 0.1380211 | 0.0441697 | 116007 | 116009 | 7.440517 |
| P6 | 014-017 | 0.3200000 | 0.1376545 | 0.0451294 | 116014 | 116017 | 4.040509 |
| P6 | 013-019 | 0.4888889 | 0.1373315 | 0.0433875 | 116013 | 116019 | 8.102725 |
| P6 | 005-013 | 0.3333333 | 0.1365565 | 0.0442138 | 116005 | 116013 | 4.450579 |
| P6 | 002-011 | 0.4042553 | 0.1363300 | 0.0431172 | 116002 | 116011 | 6.213881 |
| P6 | 008-009 | 0.3541667 | 0.1361674 | 0.0434391 | 116008 | 116009 | 5.018501 |
| P6 | 008-020 | 0.3265306 | 0.1359853 | 0.0425070 | 116008 | 116020 | 4.482680 |
| P6 | 006-019 | 0.4666667 | 0.1356848 | 0.0441123 | 116006 | 116019 | 7.503171 |
| P6 | 003-014 | 0.3265306 | 0.1354559 | 0.0438507 | 116003 | 116014 | 4.357394 |
| P6 | 006-013 | 0.5581395 | 0.1348380 | 0.0450430 | 116006 | 116013 | 9.397711 |
| P6M | 008-010 | 0.3333333 | 0.1448769 | 0.0455945 | 117008 | 117010 | 4.133318 |
| P6M | 008-020 | 0.3541667 | 0.1437420 | 0.0410528 | 117008 | 117020 | 5.125709 |
| P6M | 001-011 | 0.3333333 | 0.1433422 | 0.0465964 | 117001 | 117011 | 4.077378 |
| P6M | 010-020 | 0.3829787 | 0.1432528 | 0.0453747 | 117010 | 117020 | 5.283256 |